source(here::here("code/load.R"))

# Load and merge the two datasets
briggs <- read_csv(here("data/estimates_rb_vab_mma.csv")) |>
    mutate(sample = "Briggs",
           study_year = ifelse(is.na(study_year),
                               as.numeric(str_extract(study_id, "[0-9]{4}")),
                               study_year))

doucouliagos_meta_id <- data.table::fread(here("data/doucouliagos_meta_id.csv"))
doucouliagos <- data.table::fread(here("data/estimates_doucouliagos_2021-12-19.csv")) |>
    # type matching with briggs
    transform(study_id = as.character(study_id))

dat <- bind_rows(briggs, doucouliagos) |>
    mutate(z_stat = estimate / std.error,
           dv = ifelse(is.na(dv), "", dv),
           iv = ifelse(is.na(iv), "", iv),
           question_id = ifelse(sample == "Briggs", paste(meta_id, iv, dv, sep = "; "), question_id)) |>
    select(-c(p.value, statistic_t, conf.high, conf.low))


# Drops missing Meta ID.
# Removes these from Chris' data:
# "Get out to vote Direct Mail_With Authors.xlsx", "Get out to vote PHONE with Authors.xlsx", "Get out to vote TEXT_ With Authors.xlsx", "Copy of Get out to vote door to door_With Authors.xlsx", "wage impact of teachers unions.xlsx"
dat <- dat |> filter(!is.na(meta_id))


# Drop problematic observations before computing truth and power 
dat <- dat |>
    filter(!is.na(estimate),
           !is.nan(estimate),
           !is.infinite(estimate),
           !is.na(std.error),
           !is.nan(std.error),
           !is.infinite(std.error),
           # Pure zero standard errors are probably data entry mistakes. This
           # tolerance is arbitrary, but we tested different thresholds and
           # it made no substantive difference. We also checked to see if
           # excluding abs(estimate)<=1e-5 made a difference. It did not.
           std.error > 1e-10) |>
           # mark all rows with n_per_question < 5 for removal.
           # TriWen2020 especially contributed hundred of rows with few obs per question
           group_by(question_id) |>
           add_count() |>
           ungroup() |>
           filter(n >= 5)


# data will eventually be in long format, so we create a unique id for each estimate
# this needs to be done before we compute our various true effects
dat <- dat |> mutate(estimate_id = 1:n())


# Truth estimates
get_top1 <- function(b, se) {
    out <- b[which.min(se)]
    return(out)
}

get_10p <- function(b, se) {
    idx <- which(se <= quantile(se, .1))
    out <- weighted.mean(b[idx], 1 / se[idx]^2)
    return(out)
}

get_fe <- function(b, se) {
    out <- weighted.mean(b, 1 / se^2)
    return(out)
}

get_pet_peese <- function(b, se) {
    precision <- 1 / se^2
    se2 <- se^2
    mod <- lm(b ~ se, weights = precision)
    # Is the intercept significant?
    genuine <- summary(mod)$coefficients["(Intercept)", "Pr(>|t|)"] <= 0.1
    # Constant from regression on SE^2 if genuine
    # Constant from regression on SE if not genuine
    out <- ifelse(
        genuine == TRUE,
        coef(lm(b ~ se2, weights = precision))[1],
        coef(lm(b ~ se, weights = precision))[1])
    return(out)
}

get_uwls <- function(b, se) {
    precision <- 1 / se^2
    mod <- lm(b ~ 1, weights = precision)
    # equivalent to:
    # z <- b / se
    # precision <- 1 / se
    # mod <- lm(z ~ precision - 1)
    out <- coef(mod)[1]
    return(out)
}

get_uwls_p <- function(b, se) {
    precision <- 1 / se^2
    mod <- lm(b ~ 1, weights = precision)
    # equivalent to:
    # z <- b / se
    # precision <- 1 / se
    # mod <- lm(z ~ precision - 1)
    out <- summary(mod)$coefficients[1, "Pr(>|t|)"]
    return(out)
}

get_waap_uwls <- function(b, se, uwls) {
    # p.4 from Stanley, Doucouliagos & Ioannidis (2017, Statistics in Medicine)
    # TODO: I should take the absolute value, right?
    adequate <- se <= abs(uwls) / 2.8
    if (sum(adequate) < 2) {
        out <- unique(uwls)
    } else {
        out <- get_uwls(b[adequate], se[adequate])
    }
    return(out)
}

get_tukey <- function(x) {
    quant <- quantile(x, probs = c(.25, .75), na.rm = TRUE)
    out <- x > quant[2] + 1.5 * IQR(x, na.rm = TRUE) | x < quant[1] - 1.5 * IQR(x, na.rm = TRUE)
    return(out)
}

get_se_ratio <- function(se) {
    max(se) / min(se)
}

get_dfbeta <- function(b, se) {
    tstat <- abs(b / se)
    precision <- 1 / se
    mod <- lm(tstat ~ precision)
    out <- tryCatch(
        abs(dfbeta(mod)[, "precision"]) > 2 / sqrt(nobs(mod)),
        # keep on error because we do not want to drop entire meta-analyses
        error = function(e) rep(FALSE, length(b)))
    return(out)
}

get_fat <- function(b, se) {
    precision <- 1 / se^2
    se2 <- se^2
    mod <- lm(b ~ se, weights = precision)
    out <- coef(mod)[["se"]]
    return(out)
}

get_overestimation <- function(b, uwls) {
    abs((mean(b) - uwls) / uwls) + 1
}

get_power <- function(std.error, truth, alpha = .05, onetail = FALSE) {
    true_z <- abs(truth) / std.error
    if (isTRUE(onetail)) {
        critical_z <- qnorm(1 - alpha)
        # probability of rejecting the null to the right
        p.hi <- 1 - pnorm(critical_z - true_z)
        power <- p.hi
    } else {
        critical_z <- qnorm(1 - alpha / 2)
        # probability of rejecting the null to the right
        p.hi <- 1 - pnorm(critical_z - true_z)
        # probability of rejecting the null to the left
        p.lo <- pnorm(-critical_z - true_z)
        power <- p.lo + p.hi
    }
    return(power)
}

get_typeS <- function(std.error, truth, alpha = .05) {
    critical_z <- qnorm(1 - alpha / 2)
    true_z <- abs(truth) / std.error
    # probability of rejecting the null to the right
    p.hi <- 1 - pnorm(critical_z - true_z)
    # probability of rejecting the null to the left
    p.lo <- pnorm(-critical_z - true_z)
    # power two-tailed
    power <- p.lo + p.hi
    # Type S
    # probability that we reject in the wrong direction divided by
    # the probability that we reject in any direction
    typeS <- p.lo / power
    return(typeS)
}

get_typeM <- function(std.error, truth, alpha = .05) {
    # closed form solutions from: Lu, Qiu, and Deng (2018)
    # code adapted from the `retrodesign` 0.1.0 library, published under MIT license
    critical_z <- qnorm(1 - alpha / 2)
    lambda <- truth / std.error
    typeM <- (dnorm(lambda + critical_z) + dnorm(lambda - critical_z) +
                  lambda*(pnorm(lambda + critical_z) +pnorm(lambda-critical_z) - 1))/
                    (lambda*(1 - pnorm(lambda + critical_z) + pnorm(lambda - critical_z)))
    typeM <- abs(typeM)
    return(typeM)
}

# Make sure our hand-coded functions produce accurate results by comparing them
# to the `retrodesign` library:
# for (truth in c(.1, .3, .5, -.5)) {
#     for (se in 1:4) {
#         assert_true(all.equal(retro_design(truth, se)$power, get_power(se, truth, onetail = FALSE)))
#         assert_true(all.equal(retro_design(truth, se)$typeM, get_typeM(se, truth)))
#         assert_true(all.equal(retro_design(truth, se)$typeS, get_typeS(se, truth)))
#     }
# }

dat <- dat |>
      group_by(question_id) |>
      mutate(
        outlier_tukeyB = get_tukey(estimate),
        outlier_tukeySE = get_tukey(std.error),
        outlier_ratioSE = get_se_ratio(std.error),
        outlier_dfbeta = get_dfbeta(estimate, std.error),
        outlier_fat = get_fat(estimate, std.error),
        uwls = get_uwls(estimate, std.error),
        uwls_p = get_uwls_p(estimate, std.error),
        waapuwls = get_waap_uwls(estimate, std.error, uwls),
        petpeese = get_pet_peese(estimate, std.error),
        uwls_5x = get_uwls(estimate * 5, std.error),
        waapuwls_5x = get_waap_uwls(estimate * 5, std.error, uwls_5x),
        petpeese_5x = get_pet_peese(estimate * 5, std.error),
        uwls_tukeyB = get_uwls(estimate[!outlier_tukeyB], std.error[!outlier_tukeyB]),
        uwls_tukeySE = get_uwls(estimate[!outlier_tukeySE], std.error[!outlier_tukeySE]),
        uwls_dfbeta = get_uwls(estimate[!outlier_dfbeta], std.error[!outlier_dfbeta]),
        power_uwls = get_power(std.error, uwls),
        power_waapuwls = get_power(std.error, waapuwls),
        power_petpeese = get_power(std.error, petpeese),
        power_uwls_1tail = get_power(std.error, uwls, onetail = TRUE),
        power_waapuwls_1tail = get_power(std.error, waapuwls, onetail = TRUE),
        power_petpeese_1tail = get_power(std.error, petpeese, onetail = TRUE),
        power_uwls_5x = get_power(std.error, uwls_5x),
        power_waapuwls_5x = get_power(std.error, waapuwls_5x),
        power_petpeese_5x = get_power(std.error, petpeese_5x),
        power_uwls_tukeyB = ifelse(outlier_tukeyB, NA, get_power(std.error[!outlier_tukeyB], uwls_tukeyB[!outlier_tukeyB])),
        power_uwls_tukeySE = ifelse(outlier_tukeySE, NA, get_power(std.error[!outlier_tukeySE], uwls_tukeySE[!outlier_tukeySE])),
        power_uwls_dfbeta = ifelse(outlier_dfbeta, NA, get_power(std.error[!outlier_dfbeta], uwls_dfbeta[!outlier_dfbeta])),
        typeS_uwls = get_typeS(std.error, uwls),
        typeM_uwls = get_typeM(std.error, uwls),
        typeS_waapuwls = get_typeS(std.error, waapuwls),
        typeM_waapuwls = get_typeM(std.error, waapuwls),
        typeS_petpeese = get_typeS(std.error, petpeese),
        typeM_petpeese = get_typeM(std.error, petpeese),
        overestimation = get_overestimation(estimate, uwls)) |>
      ungroup()

#rename subfields
dat <- dat |>
  mutate(subfield = case_when(subfield == "AP" ~ "American Politics",
                              subfield == "CP" ~ "Comparative Politics",
                              subfield == "IR" ~ "International Relations",
                              subfield == "PA" ~ "Public Administration",
                              subfield == "PE" ~ "Political Economy",
                              TRUE ~ "Other"))

# clean before save
dat <- dat |>
    mutate(question_id = gsub("; ; $", "", question_id),
           question_id = gsub("; ; ", "; ", question_id)) %>%
    select(meta_id,
           question_id,
           estimate_id,
           estimate,
           std.error,
           z_stat,
           uwls,
           waapuwls,
           petpeese,
           sort(colnames(.)))

# Save
write_csv(dat, here("data/estimates.csv"))
